Loading required libraries
library(Hmisc)
library(caTools)
library(randomForest)
library(ggplot2)
library(plotly)
library(e1071)
library(ROCR)
library(pROC)
Reading and summarization of data
# Load the match results and keep only the matches played from 2008 onwards.
data <- read.csv("results.csv")
# Compare Year as a number. Comparing against the string "2008" coerces the
# whole column to character and compares lexically, which only happens to
# work while every year has exactly four digits.
data <- data[data$Year >= 2008, ]
summary(data)
## date home_team away_team home_score
## 2/29/12 : 66 Mexico : 112 Costa Rica: 90 Min. : 0.000
## 3/29/16 : 63 Qatar : 110 Zambia : 89 1st Qu.: 0.000
## 3/26/08 : 60 Japan : 106 Korea DPR : 83 Median : 1.000
## 3/5/14 : 59 USA : 106 Cameroon : 80 Mean : 1.585
## 11/14/12: 56 South Africa: 102 Iraq : 79 3rd Qu.: 2.000
## 10/11/11: 54 Oman : 97 Syria : 79 Max. :17.000
## (Other) :9593 (Other) :9318 (Other) :9451
## away_score tournament
## Min. : 0.000 Friendly :3736
## 1st Qu.: 0.000 FIFA World Cup qualification :2314
## Median : 1.000 UEFA Euro qualification : 516
## Mean : 1.095 African Cup of Nations qualification: 453
## 3rd Qu.: 2.000 AFC Asian Cup qualification : 201
## Max. :20.000 African Cup of Nations : 189
## (Other) :2542
## city country neutral Year
## Doha : 166 USA : 374 Mode :logical Min. :2008
## Dar es Salaam: 84 South Africa: 320 FALSE:7042 1st Qu.:2010
## London : 84 France : 235 TRUE :2909 Median :2013
## Muscat : 83 Qatar : 181 Mean :2013
## Amman : 79 England : 170 3rd Qu.:2015
## Kampala : 79 Sweden : 161 Max. :2018
## (Other) :9376 (Other) :8510
## Month
## Min. : 1.000
## 1st Qu.: 5.000
## Median : 7.000
## Mean : 6.956
## 3rd Qu.:10.000
## Max. :12.000
##
How Data Looks:
# Preview the first rows of the filtered match data
head(data)
## date home_team away_team home_score away_score tournament
## 29719 1/2/08 Kuwait Lebanon 3 2 Friendly
## 29720 1/5/08 Egypt Namibia 3 0 Friendly
## 29721 1/6/08 Tunisia Zambia 1 2 Friendly
## 29722 1/8/08 Tunisia Zambia 1 0 Friendly
## 29723 1/9/08 Nigeria Sudan 2 0 Friendly
## 29724 1/10/08 Egypt Mali 1 0 Friendly
## city country neutral Year Month
## 29719 Salmiya Kuwait FALSE 2008 1
## 29720 Aswan Egypt FALSE 2008 1
## 29721 Radès Tunisia FALSE 2008 1
## 29722 Radès Tunisia FALSE 2008 1
## 29723 Estepona Spain TRUE 2008 1
## 29724 Abu Dhabi United Arab Emirates TRUE 2008 1
# Checking Missing Values
# Count the NA entries of every column, then express each count as a
# whole-number percentage of the total row count.
missing <- as.data.frame(
  vapply(data, function(column) sum(is.na(column)), integer(1))
)
colnames(missing) <- "Missing_Count"
missing$Missing_Per <- round(missing$Missing_Count / nrow(data) * 100, 0)
missing
## Missing_Count Missing_Per
## date 0 0
## home_team 0 0
## away_team 0 0
## home_score 0 0
## away_score 0 0
## tournament 0 0
## city 0 0
## country 0 0
## neutral 0 0
## Year 0 0
## Month 0 0
Exploratory Analysis
Top 10 Match Hosts
# Tally the matches staged in each country and keep the ten largest counts
host <- setNames(
  as.data.frame(tail(sort(table(data$country)), 10)),
  c("Country", "No_of_Matches")
)
# Bar chart with one bar per host country
graph_1 <- ggplot(host, aes(x = Country, y = No_of_Matches)) +
  geom_bar(stat = "identity", width = 0.75, fill = "Red") +
  labs(
    title = "Top 10 Host",
    subtitle = "2008-2018",
    caption = "source: Kaggle"
  )
graph_1

Home Advantage?
# Home advantage: for every match, flag whether it was played in the home
# team's own country AND was won (home_win) or lost (away_win) by that team.
# as.character() guards against the two columns being factors with
# different level sets.
at_home <- as.character(data$country) == as.character(data$home_team)
# Strict ">" so that draws are not counted as home wins; this mirrors the
# strict "<" used for away wins.
home_win <- at_home &
  (as.integer(data$home_score) > as.integer(data$away_score))
away_win <- at_home &
  (as.integer(data$home_score) < as.integer(data$away_score))
# The variable names are reused on purpose: table() names the resulting
# column after the variable, and the plots below refer to ~home_win/~away_win.
home_win <- as.data.frame(table(home_win))
away_win <- as.data.frame(table(away_win))
graph_2_a <- plot_ly(
  home_win,
  x = ~home_win, y = ~Freq, type = "bar",
  marker = list(color = c("rgba(204,204,204,1)", "rgba(222,45,38,0.8)"))
) %>%
  layout(
    title = "Home Wins",
    yaxis = list(title = "# of Matches")
  )
graph_2_b <- plot_ly(
  away_win,
  x = ~away_win, y = ~Freq, type = "bar",
  marker = list(color = c("rgba(204,204,204,1)", "rgba(222,45,38,0.8)"))
) %>%
  layout(
    title = "Away Wins",
    yaxis = list(title = "# of Matches")
  )
graph_2_a
graph_2_b
BEST TEAMS OF ALL TIME
# Count wins per team and plot the ten teams with the most wins.
# Computed in one vectorised step instead of growing a vector row by row,
# which also removes the stray "0" entry the old seed value put in the tally.
# Draws are left out: they have no winner and must not be credited to the
# home side.
decisive <- data$home_score != data$away_score
best_team <- ifelse(
  data$home_score[decisive] > data$away_score[decisive],
  as.character(data$home_team[decisive]),
  as.character(data$away_team[decisive])
)
# table() names the resulting column after the variable (best_team), which is
# what the aes() mapping below relies on.
best_team <- as.data.frame(tail(sort(table(best_team)), 10))
graph_3 <- ggplot(best_team, aes(x = best_team, y = Freq)) +
  geom_bar(stat = "identity", width = 0.75, fill = "Blue") +
  coord_flip() +
  labs(
    title = "Top 10 Home Teams",
    subtitle = "2008-2018",
    caption = "source: Kaggle"
  )
graph_3

Top 10 TOURNAMENTS
# Ten tournaments with the most matches, shown as a plotly bar chart
tournament <- setNames(
  as.data.frame(tail(sort(table(data$tournament)), 10)),
  c("Tournament", "No_of_Matches")
)
graph_4 <- layout(
  plot_ly(tournament, x = ~Tournament, y = ~No_of_Matches, type = "bar"),
  title = "Top Tournaments"
)
graph_4
Brazil trend of playing matches over the years
# Matches per year for a single team, home or away, drawn as a line chart.
team <- "Brazil"
temp <- data.frame(data$home_team, data$away_team, data$Year)
colnames(temp) <- c("home_team", "away_team", "year")
temp <- temp[temp$home_team == team | temp$away_team == team, ]
# Tabulate every year in chronological order. The earlier
# tail(sort(table(...)), 10) ordered the years by match count and kept only
# ten of them, which drops the least busy year from the trend whenever more
# than ten years are present.
team_trend <- as.data.frame(table(temp$year))
colnames(team_trend) <- c("year", "freq")
team_trend$year <- as.character(team_trend$year)
graph_5 <- ggplot(data = team_trend, aes(x = year, y = freq, group = 1)) +
  geom_line(colour = "blue", linetype = "dashed", size = 1.5) +
  geom_point(colour = "blue", size = 4, shape = 21, fill = "white")
graph_5

Reading data for top Teams, Stadiums and Tournaments
# Re-read the full results with text columns kept as character, plus the
# three lookup tables (teams, countries, tournaments).
data <- read.csv(file = "results.csv", stringsAsFactors = FALSE)
home <- read.csv(file = "best_teams.csv")
country <- read.csv(file = "country.csv")
tournament <- read.csv(file = "tournament.csv")
Feature Engineering
Creating a winner variable
# Derive the match outcome from the two scores: "Home", "Away" or "Tie".
# Vectorised over the whole column, so it avoids the per-row copy of the data
# frame and works on an empty data frame, where 1:length(x) would iterate
# over c(1, 0). A missing score yields NA instead of stopping with an error.
data$Winner <- ifelse(
  data$home_score > data$away_score,
  "Home",
  ifelse(data$home_score < data$away_score, "Away", "Tie")
)
head(data)
## date home_team away_team home_score away_score tournament city
## 1 1872-11-30 Scotland England 0 0 Friendly Glasgow
## 2 1873-03-08 England Scotland 4 2 Friendly London
## 3 1874-03-07 Scotland England 2 1 Friendly Glasgow
## 4 1875-03-06 England Scotland 2 2 Friendly London
## 5 1876-03-04 Scotland England 3 0 Friendly Glasgow
## 6 1876-03-25 Scotland Wales 4 0 Friendly Glasgow
## country neutral Year Month Winner
## 1 Scotland FALSE 1872 11 Tie
## 2 England FALSE 1873 3 Home
## 3 Scotland FALSE 1874 3 Home
## 4 England FALSE 1875 3 Tie
## 5 Scotland FALSE 1876 3 Home
## 6 Scotland FALSE 1876 3 Home
Binning of Home and Away Team
# Collapse every team that is not listed in home$Teams into the single
# level "others", for the home side and the away side alike.
# A logical index replaces the row-by-row loops: same result, no per-row
# copy of the data frame, and safe when the data frame has zero rows.
data$home_team[!(data$home_team %in% home$Teams)] <- "others"
data$away_team[!(data$away_team %in% home$Teams)] <- "others"
unique(data$home_team)
## [1] "Scotland" "England" "others"
## [4] "USA" "Uruguay" "Austria"
## [7] "Hungary" "Argentina" "Belgium"
## [10] "France" "Netherlands" "Switzerland"
## [13] "Sweden" "Germany" "Italy"
## [16] "Norway" "Russia" "Denmark"
## [19] "Brazil" "Japan" "Paraguay"
## [22] "Spain" "Poland" "Yugoslavia"
## [25] "Romania" "Portugal" "China"
## [28] "Australia" "Turkey" "Mexico"
## [31] "Egypt" "Bulgaria" "Kenya"
## [34] "Uganda" "Ireland" "Trinidad and Tobago"
## [37] "Zimbabwe" "Zambia" "Iran"
## [40] "Korea Republic" "Ghana" "Nigeria"
## [43] "Indonesia" "Tunisia" "Malawi"
## [46] "Morocco" "Ivory Coast" "Iraq"
## [49] "Thailand" "Senegal" "Algeria"
Binning of Tournaments
# Collapse every tournament that is not listed in tournament$Tournament into
# the single level "others".
# A logical index replaces the row-by-row loop: same result, no per-row copy
# of the data frame, and safe when the data frame has zero rows.
data$tournament[!(data$tournament %in% tournament$Tournament)] <- "others"
unique(data$tournament)
## [1] "Friendly"
## [2] "British Championship"
## [3] "others"
## [4] "Copa América"
## [5] "Nordic Championship"
## [6] "International Cup"
## [7] "Baltic Cup"
## [8] "Balkan Cup"
## [9] "FIFA World Cup"
## [10] "FIFA World Cup qualification"
## [11] "CCCF Championship"
## [12] "AFC Asian Cup qualification"
## [13] "AFC Asian Cup"
## [14] "African Cup of Nations"
## [15] "Merdeka Tournament"
## [16] "UEFA Euro qualification"
## [17] "UEFA Euro"
## [18] "Windward Islands Tournament"
## [19] "African Cup of Nations qualification"
## [20] "Vietnam Independence Cup"
## [21] "UAFA Cup"
## [22] "South Pacific Games"
## [23] "King's Cup"
## [24] "Gulf Cup"
## [25] "Indonesia Tournament"
## [26] "Korea Cup"
## [27] "Oceania Nations Cup"
## [28] "CECAFA Cup"
## [29] "Kirin Cup"
## [30] "CFU Caribbean Cup qualification"
## [31] "CFU Caribbean Cup"
## [32] "Amílcar Cabral Cup"
## [33] "Nehru Cup"
## [34] "UDEAC Cup"
## [35] "Island Games"
## [36] "UNCAF Cup"
## [37] "Gold Cup"
## [38] "Confederations Cup"
## [39] "Oceania Nations Cup qualification"
## [40] "SAFF Cup"
## [41] "AFF Championship"
## [42] "Cyprus International Tournament"
## [43] "COSAFA Cup"
## [44] "Gold Cup qualification"
## [45] "WAFF Championship"
## [46] "EAFF Championship"
## [47] "AFC Challenge Cup"
## [48] "Viva World Cup"
## [49] "AFC Challenge Cup qualification"
## [50] "African Nations Championship"
## [51] "ConIFA World Football Cup"
Removal of matches (others vs others) and less important tournaments
# Keep a match when at least one of the two sides is not binned as "others"
data <- data[data$home_team != "others" | data$away_team != "others", ]
# original note: 24564 (presumably the row count after this step)
# Then keep only the matches whose tournament is not binned as "others"
data <- data[data$tournament != "others", ]
# original note: 23911 (presumably the row count after this step)
Setting country to “HOME” where neutral is FALSE
# Label the venue of every non-neutral match with the single value "HOME".
# as.logical() accepts logical values as well as "TRUE"/"FALSE" strings, so
# the test no longer depends on comparing a logical against a string.
# which() skips NA entries, and the vectorised assignment also works on an
# empty data frame, where 1:length(x) would iterate over c(1, 0).
data$country[which(!as.logical(data$neutral))] <- "HOME"
Binning of Country (Excluding the Home teams)
# For matches on neutral ground, collapse every host country that is not
# listed in country$Country into the single level "others". Rows where
# neutral is FALSE are left untouched.
# Vectorised replacement for the row-by-row loop; which() skips NA entries
# and the assignment is safe when the data frame has zero rows.
data$country[which(
  as.logical(data$neutral) & !(data$country %in% country$Country)
)] <- "others"
unique(data$country)
## [1] "HOME" "others" "Chile"
## [4] "Philippines" "Soviet Union" "Finland"
## [7] "Tanganyika" "Hong Kong" "Zanzibar"
## [10] "Singapore" "Sudan" "United Arab Republic"
## [13] "Korea DPR" "India" "Netherlands Antilles"
## [16] "Ethiopia" "Lebanon" "Israel"
## [19] "Malaysia" "Kuwait" "Libya"
## [22] "Congo" "Haiti" "Tanzania"
## [25] "Pakistan" "Mozambique" "Cameroon"
## [28] "Syria" "Qatar" "Liberia"
## [31] "Mali" "Saudi Arabia" "Réunion"
## [34] "Swaziland" "Honduras" "United Arab Emirates"
## [37] "Oman" "Angola" "Zaïre"
## [40] "Jordan" "Burkina Faso" "Gabon"
## [43] "Canada" "Cyprus" "South Africa"
## [46] "Vietnam" "El Salvador" "Guatemala"
## [49] "Ukraine" "Equatorial Guinea"
Removing unnecessary columns not required for model
# Drop the columns the model does not use: date, city, both scores and Year.
# They are removed by name. The earlier chain of positional drops
# (-1, -6, -3:-4, -6) only worked for one exact column order, because every
# drop shifted the positions used by the next one.
drop_cols <- c("date", "city", "home_score", "away_score", "Year")
data <- data[!(names(data) %in% drop_cols)]
# Number of distinct values left in each remaining column
sapply(data, function(x) length(unique(x)))
## home_team away_team tournament country neutral Month
## 51 51 37 50 2 12
## Winner
## 3
# Show the class of every remaining column
sapply(data, class)
## home_team away_team tournament country neutral Month
## "character" "character" "character" "character" "logical" "integer"
## Winner
## "character"
Converting variables into factors
# Turn each of the seven model columns into a factor in one pass
factor_cols <- c(
  "home_team", "away_team", "tournament", "country",
  "Winner", "neutral", "Month"
)
data[factor_cols] <- lapply(data[factor_cols], as.factor)
Model Creation
Splitting the data into Train and Test
# Reproducible 75/25 split on the outcome column; `split` is a logical
# vector that is TRUE for the rows assigned to the training set.
set.seed(123)
split <- sample.split(data$Winner, SplitRatio = 0.75)
train_set <- data[split, ]
test_set <- data[!split, ]
Random Forest
Training the model
# Fit a 500-tree random forest on every column except the outcome (column 7)
set.seed(123)
classifier <- randomForest(
  x = train_set[-7],
  y = train_set$Winner,
  ntree = 500
)
Testing the trained model
# Predict the outcome of the held-out matches, then cross-tabulate the
# actual outcome (rows) against the prediction (columns).
y_pred <- predict(classifier, newdata = test_set[-7])
cm <- table(test_set[, 7], y_pred)
calculating accuracy
# Accuracy = correctly classified matches / all test matches, in percent
n <- sum(cm)       # total number of test instances
diag <- diag(cm)   # correct predictions per class (the matrix diagonal)
accuracy <- sum(diag) / n
accuracy * 100
## [1] 53.98126
Naive Bayes
Training the model
# Fit a Naive Bayes model on every column except the outcome (column 7);
# this replaces the model previously stored in `classifier`.
classifier <- naiveBayes(
  x = train_set[-7],
  y = train_set$Winner
)
Testing the model
# Predict the outcome of the held-out matches, then cross-tabulate the
# actual outcome (rows) against the prediction (columns).
y_pred <- predict(classifier, newdata = test_set[-7])
cm <- table(test_set[, 7], y_pred)
calculating accuracy
# Accuracy = correctly classified matches / all test matches, in percent
n <- sum(cm)       # total number of test instances
diag <- diag(cm)   # correct predictions per class (the matrix diagonal)
accuracy <- sum(diag) / n
accuracy * 100
## [1] 54.93476
ROC
# One-vs-rest ROC curves for the model currently stored in `classifier`
# (the Naive Bayes fit at this point of the script).
#
# A ROC curve needs a score that ranks observations. The earlier version fed
# as.numeric() of the predicted label, i.e. the arbitrary integer code of the
# factor level, so the curves and their AUCs were not meaningful. The
# posterior probability of each class is used instead.
y_pred <- as.data.frame(y_pred)  # kept so y_pred stays a data frame afterwards
class_prob <- predict(classifier, newdata = test_set[-7], type = "raw")

# response: 1 = match belongs to the class, 0 = it does not.
# levels/direction are fixed so that a higher probability always means
# "more likely the class", rather than letting roc() pick the direction.
roc.home <- roc(
  response = as.integer(test_set$Winner == "Home"),
  predictor = class_prob[, "Home"],
  levels = c(0, 1), direction = "<"
)
roc.away <- roc(
  response = as.integer(test_set$Winner == "Away"),
  predictor = class_prob[, "Away"],
  levels = c(0, 1), direction = "<"
)
roc.tie <- roc(
  response = as.integer(test_set$Winner == "Tie"),
  predictor = class_prob[, "Tie"],
  levels = c(0, 1), direction = "<"
)
plot(roc.home, col = "green", main = "ROC Curve")
lines(roc.away, col = "blue")
lines(roc.tie, col = "red")
legend("topleft", c("Home", "Away", "Tie"), fill = c("green", "blue", "red"))

AUC
# Area under the Home-vs-rest ROC curve
auc(roc.home)
## Area under the curve: 0.3717
# Area under the Away-vs-rest ROC curve
auc(roc.away)
## Area under the curve: 0.6452
# Area under the Tie-vs-rest ROC curve
auc(roc.tie)
## Area under the curve: 0.4864